Setup

# Library -----------------------------------------------------------------
library(tidyverse)
library(pROC)

library(foreach)      # For parallel looping
library(doParallel) 

library(ggpubr)
library(rlist)

# fix the RNG seed so simulated data and grid-search results are reproducible
set.seed(123)


# lists to save results for final comparisons 
# (each list is filled incrementally by the sections below, keyed by method:
#  ML_CMS = ML correct model, ML_ICMS = ML incorrect model, AUC / ECE / Brier)
## model coefficients 
list_coefficients <- list()
## calibration plots 
list_calibration_plots <- list()

## ROC curve with AUC 
list_ROC <- list()

## ECE 
list_ECE <- list()

## Brier score 
list_Brier <- list()

# Functions ---------------------------------------------------------------
expit <- function(x) {
  # Inverse logit (logistic) transform: maps the real line to (0, 1).
  #
  # x: numeric vector of log-odds.
  # Returns a numeric vector of probabilities, same length as x.
  #
  # Bug fix: the original exp(x) / (1 + exp(x)) overflows to Inf / Inf = NaN
  # for large positive x; 1 / (1 + exp(-x)) is exact there (exp(-x) -> 0)
  # and still underflows gracefully to 0 for large negative x.
  1 / (1 + exp(-x))
}

logit <- function(x) {
  # Log-odds of a probability x in (0, 1); the inverse of expit().
  # Written as a difference of logs, mathematically identical to
  # log(x / (1 - x)).
  log(x) - log(1 - x)
}

simDat <- function(N, beta0, beta1, beta2, beta12) {
  # Simulate N observations from a logistic regression model with two
  # independent standard-normal covariates and their interaction.
  #
  # N:      number of observations.
  # beta0:  intercept (log-odds scale).
  # beta1, beta2: main-effect coefficients for X1 and X2.
  # beta12: coefficient of the X1 * X2 interaction.
  #
  # Returns a list with:
  #   X:        N x 2 matrix of covariates (columns X1, X2)
  #   lin_pred: linear predictor (log-odds) per observation
  #   Y:        binary outcome ~ Bernoulli(plogis(lin_pred))
  #
  # Fixes vs. original: body properly indented, `%>% return()` anti-pattern
  # removed, and plogis() (numerically stable base-R inverse logit) used
  # instead of the hand-rolled expit(), which returns NaN for large
  # positive linear predictors.

  # Gaussian covariates
  X1 <- rnorm(n = N, mean = 0, sd = 1)
  X2 <- rnorm(n = N, mean = 0, sd = 1)

  # linear predictor
  lin_pred <- beta0 + X1 * beta1 + X2 * beta2 + X1 * X2 * beta12

  # binary outcome
  Y <- rbinom(n = N, size = 1, prob = plogis(lin_pred))

  list(X = cbind(X1, X2),
       lin_pred = lin_pred,
       Y = Y)
}

# calibration plot 
calibrationPlot <- function(data, caption) {
  # Smoothed calibration curve: observed binary outcome against predicted
  # probability. Expects columns `pred` and `Y` in `data`; `caption` is
  # placed under the plot.
  ggplot(data, aes(x = pred, y = Y)) +
    geom_point(alpha = 0.3) +
    # cubic-regression-spline GAM smoother serves as the calibration curve
    geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs")) +
    theme_bw() +
    labs(
      x = "Predicted probabilities",
      y = "Observation",
      caption = caption
    )
}

ROCplot <- function(data, caption) {
  # ROC curve for data$pred against the binary outcome data$Y, with the
  # AUC shown in the title and `caption` under the plot.
  #
  # Bug fix: the original ended with `geom_abline(...) %>% return()`.
  # Because %>% binds more tightly than +, return() fired while the layer
  # expression was being evaluated, so the function could return a bare
  # geom_abline layer instead of the assembled plot. The pipe-into-return
  # is removed and the plot is returned as the last expression.
  roc_temp <- roc(data$Y, data$pred)

  roc_df <- data.frame(sens = roc_temp$sensitivities,
                       spec = roc_temp$specificities)

  ggplot(roc_df, aes(y = sens, x = 1 - spec)) +
    geom_line() +
    # chance diagonal; fixed values belong outside aes()
    geom_abline(intercept = 0, slope = 1,
                linetype = "dashed",
                color = "grey") +
    theme_bw() +
    labs(y = "Sensitivity",
         x = "1 - Specificity",
         title = paste("AUC = ", round(roc_temp$auc, 2)),
         caption = caption)
}

ECE <- function(obs, pred) {
  # Mean absolute difference between observed outcomes and predicted
  # probabilities, used throughout this script as the calibration error.
  # NOTE(review): this is the mean absolute error, not the binned
  # "expected calibration error" from the calibration literature.
  abs_err <- abs(obs - pred)
  sum(abs_err) / length(abs_err)
}

Brier <- function(obs, pred) {
  # Brier score: mean squared difference between the observed outcome
  # and the predicted probability (lower is better).
  err <- obs - pred
  mean(err * err)
}

predict_for_beta <- function(data, beta) {
  # Predicted event probabilities for a fixed coefficient vector applied
  # to the two-covariate interaction model used throughout this script.
  #
  # data: data.frame (or list) with numeric columns X1 and X2.
  # beta: numeric vector of length 4 — (intercept, X1, X2, X1:X2).
  # Returns a numeric vector of probabilities, one per row of `data`.
  #
  # Fixes vs. original: `%>% return()` anti-pattern removed; design matrix
  # built directly with cbind() instead of data.frame() %>% as.matrix();
  # stats::plogis() replaces the hand-rolled expit(), which returns NaN
  # for large positive linear predictors.

  # design matrix (the intercept column of 1s is recycled by cbind)
  Xdesign <- cbind(Intercept = 1,
                   X1 = data$X1,
                   X2 = data$X2,
                   Interaction = data$X1 * data$X2)

  # linear predictor; drop() collapses the N x 1 matrix to a vector
  lin_pred <- drop(Xdesign %*% beta)

  # predicted probabilities
  plogis(lin_pred)
}

# function to optimize AUC 
AUC_optimization <- function(data, step, 
                             grid_search, 
                             Xdesign) {
  # AUC on `data` for the candidate coefficient vector stored in row
  # `step` of `grid_search`, using the precomputed design matrix
  # `Xdesign` (columns: intercept, X1, X2, X1:X2).
  #
  # Fixes vs. original: `%>% return()` anti-pattern removed; plogis()
  # replaces the hand-rolled expit() (NaN for large linear predictors).
  beta <- grid_search[step, ]

  # compute predicted probabilities for this candidate
  pred <- plogis(drop(Xdesign %*% beta))

  # estimate AUC (pROC::roc)
  roc(data$Y, pred)$auc
}

ECE_optimization <- function(data, step, 
                             grid_search, 
                             Xdesign) {
  # ECE on `data` for the candidate coefficient vector stored in row
  # `step` of `grid_search`, using the precomputed design matrix
  # `Xdesign` (columns: intercept, X1, X2, X1:X2).
  #
  # Fixes vs. original: stale "estimate AUC" comment corrected (this
  # computes the ECE); `%>% return()` anti-pattern removed; plogis()
  # replaces the hand-rolled expit().
  beta <- grid_search[step, ]

  # compute predicted probabilities for this candidate
  pred <- plogis(drop(Xdesign %*% beta))

  # estimate ECE
  ECE(obs = data$Y, pred = pred)
}


Brier_optimization <- function(data, step, 
                             grid_search, 
                             Xdesign) {
  # Brier score on `data` for the candidate coefficient vector stored in
  # row `step` of `grid_search`, using the precomputed design matrix
  # `Xdesign` (columns: intercept, X1, X2, X1:X2).
  #
  # Fixes vs. original: stale "estimate AUC" comment corrected (this
  # computes the Brier score); `%>% return()` anti-pattern removed;
  # plogis() replaces the hand-rolled expit().
  beta <- grid_search[step, ]

  # compute predicted probabilities for this candidate
  pred <- plogis(drop(Xdesign %*% beta))

  # estimate Brier score
  Brier(obs = data$Y, pred = pred)
}

Data

# load saved R workspace 
# NOTE(review): this restores the objects referenced throughout the script
# (train_dat, train_dat_df, val_dat_df, auc_results, ece_results,
# brier_results, ...) that were originally produced by the commented-out
# code below and by the parallel grid searches further down — confirm
# "reproduce.RData" is present before running.
load("reproduce.RData")


# Simulation --------------------------------------------------------------
# true coefficients (log-odds / log-odds-ratio scale):
# intercept, two main effects, and the X1:X2 interaction
beta0 <- 1
beta1 <- 2.5
beta2 <- 4
beta12 <- 3

# train_dat <- simDat(N = 10000, 
#                     beta0 = beta0, beta1 = beta1, beta2 = beta2, 
#                     beta12 = beta12)
# 
# val_dat <- simDat(N = 1000, 
#                   beta0 = beta0, beta1 = beta1, beta2 = beta2, 
#                   beta12 = beta12)
# 
# # transform into data frames 
# train_dat_df <- data.frame(Y = train_dat$Y) %>% 
#   cbind(train_dat$X)
# 
# val_dat_df <- data.frame(Y = val_dat$Y) %>% 
#   cbind(val_dat$X)


# distribution of probabilities
# (density of the true event probabilities in the training data)
ggplot(data = data.frame(x = expit(train_dat$lin_pred)),
       aes(x = x)) +
  geom_density() +
  theme_bw() +
  labs(x = "Probability")

# true probability vs. observed binary outcome in the training data
ggplot(data = data.frame(x = expit(train_dat$lin_pred),
                         y = train_dat$Y),
       aes(x = x,
           y = y)) +
  geom_point() +
  theme_bw() +
  labs(x = "Probability",
       y = "Outcome")

Maximum Likelihood Estimation

Maximum likelihood estimation is the standard classical approach for estimating the coefficients of a logistic regression; the glm() fits below obtain it via iteratively reweighted least squares (IWLS).

Model fitting: correct model specification with intercept and interaction

# model fitting according to IWLS 
# correctly specified model: intercept, both main effects, and the
# X1:X2 interaction (the formula's X1*X2 expands to X1 + X2 + X1:X2)
ML_mod <- glm(Y ~ 1 + X1 + X2 + X1*X2, 
    data = train_dat_df, 
    family = "binomial") 

# model coefficients 
# estimates with standard errors, plus the true values and term names
ML_mod_coef <- summary(ML_mod)$coef %>% 
  as.data.frame() %>% 
  mutate(True = c(beta0, beta1, beta2 , beta12), 
         Coef = rownames(summary(ML_mod)$coef))

# coefficient plot: estimates with 95% Wald intervals vs. true values
list_coefficients[["ML_CMS"]] <- ggplot(data = ML_mod_coef, 
       aes(x = Coef, 
           y = Estimate)) +
  geom_point() +
  geom_errorbar(aes(ymin = Estimate - 1.96 * `Std. Error`, 
                    ymax = Estimate + 1.96 * `Std. Error`)) +
  geom_point(aes(y = True, 
                 color = "True coefficient"), 
             shape = 4, size = 3) +
  theme_bw() +
  theme(legend.position = "bottom") +
  labs(x = "Model coefficient", 
       color = "", 
       caption = "ML estimation, correct") +
  coord_flip()

# in-sample (training) AUC of the ML fit; reused as a reference line
# in the AUC grid-search plot below
AUC_train_ML <- roc(train_dat_df$Y, ML_mod$fitted.values)$auc


# predict probabilities for validation data 
val_dat_df$pred <- predict(ML_mod, 
                        newdata = val_dat_df, 
                        type = "response")



# calibration plot 
list_calibration_plots[["ML_CMS"]] <- calibrationPlot(data = val_dat_df, 
                caption = "ML estimation, correct")


# ROC 
list_ROC[["ML_CMS"]] <- ROCplot(data = val_dat_df, 
        caption = "ML estimation, correct")

# ECE / Brier score 
list_ECE[["ML_CMS"]] <- ECE(val_dat_df$Y, val_dat_df$pred)
list_Brier[["ML_CMS"]] <- Brier(val_dat_df$Y, val_dat_df$pred)

Model fitting: incorrect model without interaction

# model fitting according to IWLS 
# misspecified model: the true X1:X2 interaction is omitted
# NOTE(review): ML_mod and ML_mod_coef are overwritten here; the correct
# model's fit objects are no longer available after this point.
ML_mod <- glm(Y ~ 1 + X1 + X2, 
    data = train_dat_df, 
    family = "binomial") 

# model coefficients 
# estimates with standard errors, plus the true values and term names
ML_mod_coef <- summary(ML_mod)$coef %>% 
  as.data.frame() %>% 
  mutate(True = c(beta0, beta1, beta2), 
         Coef = rownames(summary(ML_mod)$coef))

# coefficient plot: estimates with 95% Wald intervals vs. true values
list_coefficients[["ML_ICMS"]]  <- ggplot(data = ML_mod_coef, 
       aes(x = Coef, 
           y = Estimate)) +
  geom_point() +
  geom_errorbar(aes(ymin = Estimate - 1.96 * `Std. Error`, 
                    ymax = Estimate + 1.96 * `Std. Error`)) +
  geom_point(aes(y = True, 
                 color = "True coefficient"), 
             shape = 4, size = 3) +
  theme_bw() +
  theme(legend.position = "bottom") +
  labs(x = "Model coefficient", 
       color = "", 
       caption = "ML estimation, incorrect") +
  coord_flip()

# predict probabilities for validation data 
val_dat_df$pred <- predict(ML_mod, 
                        newdata = val_dat_df, 
                        type = "response")



# calibration plot 
list_calibration_plots[["ML_ICMS"]] <- calibrationPlot(data = val_dat_df, 
                caption = "ML estimation, incorrect")


# ROC 
list_ROC[["ML_ICMS"]] <- ROCplot(data = val_dat_df, 
        caption = "ML estimation, incorrect")

# ECE / Brier score 
list_ECE[["ML_ICMS"]] <- ECE(val_dat_df$Y, val_dat_df$pred)
list_Brier[["ML_ICMS"]] <- Brier(val_dat_df$Y, val_dat_df$pred)

Minimization of criterion

# Set up for brute force grid search 
# 4 dimensional space for searching 
# NOTE(review): the search bounds bracket the true values
# (beta0 = 1, beta1 = 2.5, beta2 = 4, beta12 = 3); results below depend
# on these bounds and step sizes.
grid_search <- expand.grid(beta0 = seq(-3, 3, by = 0.5), 
                           beta1 = seq(-1, 3.5, by = 0.1), 
                           beta2 = seq(2, 5, by = 0.1), 
                           beta12 = seq(2, 5, by = 0.1)) %>% 
  as.matrix()

# design matrix for the training data, matching the coefficient order
# (intercept, X1, X2, X1:X2) used by the *_optimization() functions
Xdesign <- data.frame(Intercept = 1, 
                      X1 = train_dat_df$X1, 
                      X2 = train_dat_df$X2, 
                      Interaction = train_dat_df$X1 * train_dat_df$X2) %>% 
  as.matrix()

AUC

The intercept coefficient is redundant here: it shifts all predicted probabilities monotonically without changing their ranking, so the AUC is unaffected and the intercept is not identifiable from a discrimination metric.

# find best performing AUC based on brute-force grid search 
n_steps <- nrow(grid_search)

# Setup parallel backend (adjust 'cores' based on your system)
# NOTE(review): the parallel loop is commented out; `auc_results` used
# below is restored from reproduce.RData. Each row is c(step, AUC).
# cores <- detectCores() - 1  # Leave one core free
# 
# cl <- makeCluster(cores)
# registerDoParallel(cl)
# 
# # Perform grid search with parallelization
# auc_results <- foreach(step = 1:n_steps, 
#                        .combine = rbind, 
#                        .packages = c("pROC", "dplyr")) %dopar% {
#   auc_value <- AUC_optimization(data = train_dat_df,
#                                 step = step, 
#                                 grid_search = grid_search, 
#                                 Xdesign = Xdesign)
#   c(step, auc_value)  # Return step and corresponding AUC value
# }
# 
# # Stop the parallel backend
# stopCluster(cl)


# attach the candidate coefficients to each grid-search result
AUC_results_df <- data.frame(step = auc_results[, 1], 
                             AUC = auc_results[, 2]) %>% 
  mutate(beta0 = grid_search[step, 1], 
         beta1 = grid_search[step, 2], 
         beta2 = grid_search[step, 3], 
         beta12 = grid_search[step, 4])

# long format: one row per (candidate, coefficient) for faceting
AUC_results_df_long <- AUC_results_df %>% 
  gather(coef, value, -c(step, AUC))


# visualize results 
# horizontal line: training AUC of the correctly specified ML model
ggplot(data = AUC_results_df_long, 
       aes(x = value, 
           y = AUC, 
           color = coef)) +
  geom_point() +
  geom_hline(aes(yintercept = AUC_train_ML)) +
  theme_bw() +
  theme(legend.position = "bottom") +
  labs(color = "") +
  facet_grid(~ coef)

# Find the best parameters (maximizing AUC)
best_AUC <- AUC_results_df %>% filter(AUC == max(AUC))


# best beta 
# intercept fixed at 0 (not identifiable from a discrimination metric)
# NOTE(review): unique() is a fragile way to collapse ties — if several
# grid rows share the best AUC, this silently assumes the tied rows agree
# and that no two coefficients happen to share a value; verify best_AUC
# has exactly one row.
AUC_beta <- c(0, best_AUC$beta1, best_AUC$beta2, best_AUC$beta12) %>% 
  unique()

# predict for best beta
val_dat_df$pred <- predict_for_beta(data = val_dat_df, 
                                    beta = AUC_beta)


# calibration plot 
list_calibration_plots[["AUC"]] <- calibrationPlot(data = val_dat_df, 
                caption = "AUC optimization")


# ROC 
list_ROC[["AUC"]] <- ROCplot(data = val_dat_df, 
        caption = "AUC optimization")

# ECE / Brier score 
list_ECE[["AUC"]] <- ECE(val_dat_df$Y, val_dat_df$pred)
list_Brier[["AUC"]] <- Brier(val_dat_df$Y, val_dat_df$pred)

ECE

# ECE grid search: the parallel loop below produced `ece_results`
# (restored from reproduce.RData); each row is c(step, ECE).
# cl <- makeCluster(cores)
# registerDoParallel(cl)
# 
# # Perform grid search with parallelization
# ece_results <- foreach(step = 1:n_steps, 
#                        .combine = rbind, 
#                        .packages = c("dplyr")) %dopar% {
#   ece_value <- ECE_optimization(data = train_dat_df,
#                                 step = step, 
#                                 grid_search = grid_search, 
#                                 Xdesign = Xdesign)
#   c(step, ece_value)  # Return step and corresponding ECE value
# }
# 
# # Stop the parallel backend
# stopCluster(cl)


# attach the candidate coefficients to each grid-search result
ECE_results_df <- data.frame(step = ece_results[, 1], 
                             ECE = ece_results[, 2]) %>% 
  mutate(beta0 = grid_search[step, 1], 
         beta1 = grid_search[step, 2], 
         beta2 = grid_search[step, 3], 
         beta12 = grid_search[step, 4])

# long format: one row per (candidate, coefficient) for faceting
ECE_results_df_long <- ECE_results_df %>% 
  gather(coef, value, -c(step, ECE))


# visualize results 
# (bug fix: the reference line previously drawn here was AUC_train_ML,
# a training AUC copy-pasted from the AUC section; it is meaningless on
# an ECE axis and has been removed)
ggplot(data = ECE_results_df_long, 
       aes(x = value, 
           y = ECE, 
           color = coef)) +
  geom_point() +
  theme_bw() +
  theme(legend.position = "bottom") +
  labs(color = "") +
  facet_grid(~ coef)

# Find the best parameters (minimizing ECE)
best_ECE <- ECE_results_df %>% filter(ECE == min(ECE))


# best beta 
ECE_beta <- c(best_ECE$beta0, best_ECE$beta1, best_ECE$beta2, best_ECE$beta12)

# predict for best beta
val_dat_df$pred <- predict_for_beta(data = val_dat_df, 
                                    beta = ECE_beta)


# calibration plot 
list_calibration_plots[["ECE"]] <- calibrationPlot(data = val_dat_df, 
                caption = "ECE optimization")


# ROC 
list_ROC[["ECE"]] <- ROCplot(data = val_dat_df, 
        caption = "ECE optimization")

# ECE / Brier score 
list_ECE[["ECE"]] <- ECE(val_dat_df$Y, val_dat_df$pred)
list_Brier[["ECE"]] <- Brier(val_dat_df$Y, val_dat_df$pred)

Brier score

# Brier grid search: the parallel loop below produced `brier_results`
# (restored from reproduce.RData); each row is c(step, Brier).
# cl <- makeCluster(cores)
# registerDoParallel(cl)
# 
# # Perform grid search with parallelization
# brier_results <- foreach(step = 1:n_steps, 
#                        .combine = rbind, 
#                        .packages = c("dplyr")) %dopar% {
#   brier_value <- Brier_optimization(data = train_dat_df,
#                                 step = step, 
#                                 grid_search = grid_search, 
#                                 Xdesign = Xdesign)
#   c(step, brier_value)  # Return step and corresponding Brier value
# }
# 
# # Stop the parallel backend
# stopCluster(cl)


# attach the candidate coefficients to each grid-search result
Brier_results_df <- data.frame(step = brier_results[, 1], 
                             Brier = brier_results[, 2]) %>% 
  mutate(beta0 = grid_search[step, 1], 
         beta1 = grid_search[step, 2], 
         beta2 = grid_search[step, 3], 
         beta12 = grid_search[step, 4])

# long format: one row per (candidate, coefficient) for faceting
Brier_results_df_long <- Brier_results_df %>% 
  gather(coef, value, -c(step, Brier))


# visualize results 
# (bug fix: the reference line previously drawn here was AUC_train_ML,
# a training AUC copy-pasted from the AUC section; it is meaningless on
# a Brier-score axis and has been removed)
ggplot(data = Brier_results_df_long, 
       aes(x = value, 
           y = Brier, 
           color = coef)) +
  geom_point() +
  theme_bw() +
  theme(legend.position = "bottom") +
  labs(color = "") +
  facet_grid(~ coef)

# Find the best parameters (minimizing the Brier score)
best_Brier <- Brier_results_df %>% filter(Brier == min(Brier))


# best beta 
Brier_beta <- c(best_Brier$beta0, best_Brier$beta1, best_Brier$beta2, best_Brier$beta12)

# predict for best beta
val_dat_df$pred <- predict_for_beta(data = val_dat_df, 
                                    beta = Brier_beta)


# calibration plot 
list_calibration_plots[["Brier"]] <- calibrationPlot(data = val_dat_df, 
                caption = "Brier score optimization")


# ROC 
list_ROC[["Brier"]] <- ROCplot(data = val_dat_df, 
        caption = "Brier score optimization")

# ECE / Brier score 
list_ECE[["Brier"]] <- ECE(val_dat_df$Y, val_dat_df$pred)
list_Brier[["Brier"]] <- Brier(val_dat_df$Y, val_dat_df$pred)

Coefficients

# coefficient estimates 
# NOTE(review): data.frame() requires AUC_beta, ECE_beta and Brier_beta to
# each have length 4 (one value per coefficient); ties in a grid search
# would change those lengths and break this — verify upstream.
coef_results <- data.frame(AUC = AUC_beta, 
                           ECE = ECE_beta, 
                           Brier = Brier_beta, 
                           True = c(beta0, beta1, beta2, beta12), 
                           Coef = c("(Intercept)", "X1", "X2", "X1:X2")) %>% 
  gather(Method, Estimate, -c(Coef, True))


# jittered dot plot comparing the three criterion-based estimates
# against the true coefficient values
list_coefficients[["AUC_ECE_Brier"]] <- ggplot(data = coef_results, 
       aes(x = Coef, 
           y = Estimate, 
           color = Method)) +
  geom_point(position = position_jitter(width = 0.2, height = 0)) +
  geom_point(aes(y = True, 
                 color = "True coefficient"), 
             shape = 4, size = 3) +
  theme_bw() +
  theme(legend.position = "bottom") +
  labs(x = "Model coefficient", 
       color = "", 
       caption = "AUC / Brier / ECE") +
  coord_flip()

Comparison

Based on validation data set.

# coefficient plots: ML (correct), ML (incorrect), criterion-based
ggarrange(list_coefficients$ML_CMS, 
          list_coefficients$ML_ICMS, 
          list_coefficients$AUC_ECE_Brier, 
          nrow = 2, ncol = 2)

# calibration plots 
ggarrange(list_calibration_plots$ML_CMS, 
          list_calibration_plots$ML_ICMS,
          list_calibration_plots$AUC,
          list_calibration_plots$ECE, 
          list_calibration_plots$Brier, 
          nrow = 2, ncol = 3)

# AUC 
# ROC curves for all five methods
ggarrange(list_ROC$ML_CMS, 
          list_ROC$ML_ICMS, 
          list_ROC$AUC, 
          list_ROC$ECE, 
          list_ROC$Brier)

# ECE 
# validation-set ECE per method (lower is better); the two unprefixed
# lines below are pasted knitted output, not executable code
list_ECE %>% list.cbind() %>% knitr::kable()
ML_CMS ML_ICMS AUC ECE Brier
0.1808541 0.2429615 0.177698 0.1624228 0.1817578
# Brier score 
# validation-set Brier score per method (lower is better)
list_Brier %>% list.cbind() %>% knitr::kable()
ML_CMS ML_ICMS AUC ECE Brier
0.0941789 0.1187399 0.1074768 0.0988773 0.0940754

Save

# persist the individual result lists (commented out; uncomment to refresh)
# save(list_calibration_plots, file = "list_calibration_plots.RData")
# save(list_ROC, file = "list_ROC.RData")
# save(list_ECE, file = "list_ECE.RData")
# save(list_Brier, file = "list_Brier.RData")


# save entire workspace 
# (this is the file restored by load("reproduce.RData") at the top)
# save.image(file = "reproduce.RData")